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Washington, DC 20231 



Sir: 



In response to the Office action mailed on September 23, 2002, having a 
shortened statutory response period set to expire on December 23, 2002, a three (3) 
month extension of time up to and including March 23, 2003 being submitted herewith 
this Amendment, please amend the above identified application as follows: 
IN THE DRAWINGS 

Please add Figures 12A-34E in accordance with the attached sheets of Figures 



If there are any outstanding issues that might be resolved by an interview or an 
Examiner's amendment, the Examiner is requested to call Applicants' attorney at the 
telephone number shown below. 



12A-20E. 



Respectfully submitted, 



Date: /W H Uol 




Lawrence T. Cullen 
Registration No. 44,489 



600 13th Street, N.W. , Suite 1200 
Washington, D.C. 20006-3096 
Telephone: (202) 756-8000 
Facsimile: (202) 756-8087 
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Definition 



def WideSwitch(op,rd,rc,rb,ra)

    d <- RegRead(rd, 128)
    c <- RegRead(rc, 64)
    b <- RegRead(rb, 128)
    if c_{1..0} ≠ 0 then
        raise AccessDisallowedByVirtualAddress
    elseif c_{6..0} ≠ 0 then
        VirtAddr <- c and (c-1)
        w <- wsize <- (c and (0-c)) || 0^{1}
    else
        VirtAddr <- c
        w <- wsize <- 128
    endif

    msize <- 8*wsize
    lwsize <- log(wsize)
    case op of
        W.SWITCH.B:
            order <- B
        W.SWITCH.L:
            order <- L
    endcase

    m <- LoadMemory(c,VirtAddr,msize,order)
    db <- d || b
    for i <- 0 to 127
        j <- 0 || i_{lwsize-1..0}
        k <- m_{7*w+j} || m_{6*w+j} || m_{5*w+j} || m_{4*w+j} || m_{3*w+j} || m_{2*w+j} || m_{w+j} || m_{j}
        l <- i_{7..lwsize} || j_{lwsize-1..0}
        a_{i} <- db_{l}
    endfor

    RegWrite(ra, 128, a)

enddef



Exceptions 



Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 



# / # 



l2(o 



Operation codes 



W.TRANSLATE.8.B 


Wide translate bytes big-endian


W.TRANSLATE.16.B 


Wide translate doublets big-endian 


W.TRANSLATE.32.B 


Wide translate quadlets big-endian 


W.TRANSLATE.64.B 


Wide translate octlets big-endian 


W.TRANSLATE.8.L 


Wide translate bytes little-endian 


W.TRANSLATE.16.L


Wide translate doublets little-endian 


W.TRANSLATE.32.L 


Wide translate quadlets little-endian 


W.TRANSLATE.64.L 


Wide translate octlets little-endian 


Selection 


class 


size 


order 


Wide translate 


8 16 32 64 


B L 


Format 






W.TRANSLATE.size.order rd=rc,rb 




rd=wtranslatesizeorder(rc,rb) 
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Definition 



def WideTranslate(op,gsize,rd,rc,rb)
    c <- RegRead(rc, 64)
    b <- RegRead(rb, 128)
    lgsize <- log(gsize)

    if c_{lgsize-4..0} ≠ 0 then
        raise AccessDisallowedByVirtualAddress
    endif

    if c_{4..lgsize-3} ≠ 0 then
        wsize <- (c and (0-c)) || 0^{3}
        t <- c and (c-1)
    else
        wsize <- 128
        t <- c
    endif

    lwsize <- log(wsize)

    if t_{lwsize+4..lwsize-2} ≠ 0 then
        msize <- (t and (0-t)) || 0^{4}
        VirtAddr <- t and (t-1)
    else
        msize <- 256*wsize
        VirtAddr <- t
    endif

    case op of
        W.TRANSLATE.B:
            order <- B
        W.TRANSLATE.L:
            order <- L
    endcase

    m <- LoadMemory(c,VirtAddr,msize,order)
    vsize <- msize/wsize
    lvsize <- log(vsize)

    for i <- 0 to 128-gsize by gsize
        j <- (((order=B)^{lvsize}) ^ (b_{lvsize-1+i..i})) * wsize + i_{lwsize-1..0}
        a_{gsize-1+i..i} <- m_{j+gsize-1..j}
    endfor

    RegWrite(rd, 128, a)
enddef
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Exceptions 

Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 
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W.MULMAT.8.B 



W.MULMAT.8.L 



W.MUL.MAT.16.B 



W.MUL.MAT.16.L 
W.MULMAT.32.B 
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M.32.B 
M.32.L 



W.MULMAT.P.8.B 
W.MULMAT 
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W. ML) L. MAT. P. 1 6. B 



W.MUL.MAT.P.16.L 
W.MULMAT.P.32.B 



W.MUL. 
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MAT. P. 32. L 
MAT.U.8.B 



W.MUL 
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.U.8.L 



MAT. 
MAT.U.16.B 



U.16.L 



W.MUL 

W.MUL 

W.MUL.MAT.U.32.L



.MAT 
.MAT 



U.32.B 



Wide multiply matrix signed byte big-endian
Wide multiply matrix signed byte little-endian



Wide multiply matrix signed doublet big-endian 
Wide multiply matrix signed doublet little-endian 



Wide multiply matrix signed quadlet big-endian 



Wide multiply matrix mixed-signed quadlet big-endian 
Wide multiply matrix mixed-signed quadlet little-endian 



Wide multiply matrix polynomial byte big-endian 



Wide multiply matrix polynomial byte little-endian 



Wide multiply matrix polynomial doublet big-endian 



Wide multiply matrix polynomial doublet little-endian 



Wide multiply matrix polynomial quadlet big-endian 



Wide multiply matrix polynomial quadlet little-endian 



Wide multiply matrix unsigned byte big-endian 



Wide multiply matrix unsigned byte little-endian 



Wide multiply matrix unsigned doublet big-endian 



Wide multiply matrix unsigned doublet little-endian 



Wide multiply matrix unsigned quadlet big-endian 



Wide multiply matrix unsigned quadlet little-endian 
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Format 



W.op.size.order rd=rc,rb
rd=wopsizeorder(rc,rb) 
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m[rc](64 # 64/size) 
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Wide multiply matrix complex 



Definition 

def mul(size,h,vs,v,i,ws,w,j) as
    mul <- ((vs&v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws&w_{size-1+j})^{h-size} || w_{size-1+j..j})
enddef

def c <- PolyMultiply(size,a,b) as
    p[0] <- 0^{2*size}
    for k <- 0 to size-1
        p[k+1] <- p[k] ^ a_{k} ? (0^{size-k} || b || 0^{k}) : 0^{2*size}
    endfor
    c <- p[size]
enddef

def WideMultiplyMatrix(major,op,gsize,rd,rc,rb)
    d <- RegRead(rd, 128)
    c <- RegRead(rc, 64)
    b <- RegRead(rb, 128)
    lgsize <- log(gsize)

    if c_{lgsize-4..0} ≠ 0 then
        raise AccessDisallowedByVirtualAddress
    endif

    if c_{2..lgsize-3} ≠ 0 then
        wsize <- (c and (0-c)) || 0^{4}
        t <- c and (c-1)
    else
        wsize <- 64
        t <- c
    endif

    lwsize <- log(wsize)

    if t_{lwsize+6-lgsize..lwsize-3} ≠ 0 then
        msize <- (t and (0-t)) || 0^{4}
        VirtAddr <- t and (t-1)
    else
        msize <- 128*wsize/gsize
        VirtAddr <- t
    endif

case major of 

W.MINOR.B: 

order <- B 
W.MINOR.L: 

order <- L 

endcase 
case op of 

W.MULMAT.U.8, W.MULMAT.U.16, W.MUL.MAT.U.32, W.MUL.MAT.U.64: 
ms <- bs «- 0 

W.MULMAT.M.8, W.MULMAT.M.16, W.MULMAT.M.32, W.MUL.MAT.M.64: 
ms <- 0 
bs <- 1 

W.MUL.MAT.8, W. MUL. MAT. 16, W.MUL.MAT.32, W.MULMAT.64, 
W. MUL. MAT. C. 8, W. MUL. MAT. C. 16, W.MUL.MAT.C.32, W. MUL. MAT. C. 64: 
ms <— bs <— 1 

W. MUL. MAT. P. 8, W. MUL. MAT. P. 16, W. MUL. MAT. P. 32, W. MUL. MAT. P. 64: 
endcase 



r-ic. i 1 / » 



    m <- LoadMemory(c,VirtAddr,msize,order)
    h <- 2*gsize

    for i <- 0 to wsize-gsize by gsize
        q[0] <- 0^{2*gsize}
        for j <- 0 to vsize-gsize by gsize
            case op of
                W.MULMAT.P.8, W.MULMAT.P.16, W.MULMAT.P.32, W.MULMAT.P.64:
                    k <- i+wsize*j_{8..lgsize}
                    q[j+gsize] <- q[j] ^ PolyMultiply(gsize,m_{k+gsize-1..k},b_{j+gsize-1..j})
                W.MULMAT.C.8, W.MULMAT.C.16, W.MULMAT.C.32, W.MULMAT.C.64:
                    if (~i) & j & gsize = 0 then
                        k <- i-(j&gsize)+wsize*j_{8..lgsize+1}
                        q[j+gsize] <- q[j] + mul(gsize,h,ms,m,k,bs,b,j)
                    else
                        k <- i+gsize+wsize*j_{8..lgsize+1}
                        q[j+gsize] <- q[j] - mul(gsize,h,ms,m,k,bs,b,j)
                    endif
                W.MULMAT.8, W.MUL.MAT.16, W.MULMAT.32, W.MUL.MAT.64,
                W.MULMAT.M.8, W.MULMAT.M.16, W.MULMAT.M.32, W.MULMAT.M.64,
                W.MULMAT.U.8, W.MULMAT.U.16, W.MULMAT.U.32, W.MULMAT.U.64:
                    q[j+gsize] <- q[j] + mul(gsize,h,ms,m,i+wsize*j_{8..lgsize},bs,b,j)
        endfor

        a_{2*gsize-1+2*i..2*i} <- q[vsize]
    endfor

    a_{127..2*wsize} <- 0
    RegWrite(rd, 128, a)
enddef



Exceptions 



Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 
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Operation codes 



W.MULMAT.X.B 


Wide multiply matrix extract big-endian 


W.MULMAT.X.L 


Wide multiply matrix extract little-endian 



Selection 



class 


op 


order 


Multiply matrix extract 


W.MUL.MAT.X 


B L 



Format 

W.op.order ra=rc,rd,rb
ra=wop(rc,rd,rb) 
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Wide multiply matrix extract doublets 



Definition A a 

def mul(size,h,vs,v,i,ws,w,j) as
    mul <- ((vs&v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws&w_{size-1+j})^{h-size} || w_{size-1+j..j})
enddef

def WideMultiplyMatrixExtract(op,ra,rb,rc,rd)
d <- RegRead(rd, 128) 
c <- RegRead(rc, 64) 
b <- RegRead(rb, 128) 
case b_{8..0} of
    0..255:
        sgsize <- 128
    256..383:
        sgsize <- 64
    384..447:
        sgsize <- 32
    448..479:
        sgsize <- 16
    480..495:
        sgsize <- 8
    496..503:
        sgsize <- 4
    504..507:
        sgsize <- 2
    508..511:
        sgsize <- 1
endcase
l <- b_{11}
m <- b_{12}
n <- b_{13}
signed <- b_{14}
if C3 .0 * 0 then 

wsize <- (c and (0-c)) || 0 4 

t <- c and (c-1) 

else 

wsize <- 128 
t <- c 

endif 

if sgsize < 8 then 

gsize <- 8 
elseif sgsize > wsize/2 then 

gsize <- wsize/2 

else 

gsize <- sgsize 

endif 

lgsize <- log(gsize)
lwsize <- log(wsize)
if t_{lwsize+6-n-lgsize..lwsize-3} ≠ 0 then

msize <- (t and (0-t)) || 0^{4}

VirtAddr <- t and (t-1) 

else 

msize <- 64*(2-n)*wsize/gsize 
VirtAddr <- t 

endif 



vsize <- (1+n)*msize*gsize/wsize 

mm <- LoadMemory(c,VirtAddr,msize,order)

h <- (2*gsize) + 7 - lgsize

lmsize <- log(msize)

if VirtAddr_{lmsize-4..0} ≠ 0 then

raise AccessDisallowedByVirtualAddress

endif 

case op of 

W.MULMAT.X.B: 
order <r- B 
' W.MULMAT.X.L: 
order <- L 

endcase 

ms <- signed
ds <- signed ^ m
as <- signed or m
spos <- (b_{8..0}) and (2*gsize-1)
dpos <- (0 || b_{23..16}) and (gsize-1)
r <- spos
sfsize <- (0 || b_{31..24}) and (gsize-1)
tfsize <- (sfsize = 0) or ((sfsize+dpos) > gsize) ? gsize-dpos : sfsize
fsize <- (tfsize + spos > h) ? h - spos : tfsize
if (b_{10..9} = Z) & ~signed then
    rnd <- F

else

    rnd <- b_{10..9}

endif 



(L [SC ( Con 



for i <- 0 to wsize-gsize by gsize 
q[0] <- 0 2 *9s«ze+7-lgsize 

forj <- 0 to vsize-gsize by gsize 
if n then 

'f(~i)&j&gsize = Othen » 
k <- •-(i&gsize)^size*j 8J g SlZ e +1 

else qD+9SlZeI *~ qDI + mul (9size,h,ms,mm 1 k.as-cJ j) 
k<-i+gsize4 W size«j 8JgS i ze+1 
qO+gsize] <- q[j] - mu!(gsize,h,ms,mm.k > ds 1 d j) 

else 

endif qD+9Si2e] qDl + mu, (9si2e,h,ms 1 mm l i + j* W size/gsi 2 e,ds t d j) 
endfor 

P<-q[128] 
case rnd of 
none, N: 

s<-0»"|| -p r ||p|:-1 

Z: 

s<-0h-r|| 

F: 

s <- 0 h 

C: 

s <— 0 h r || V 

endcase 

v^((ds&p h . 1 )|| P ) + (0||s) 

if (Vh..r+fsize = (as & v r+ f si2e _i) h+1 - r -fsize) or not j then 

w <_ (as & v r+ f S i ze .i)9 siz e- f si2e-dpos y v fsize . 1+r r || o^pos 

else 

w <r- (s ? (Vh || -vfl size - d P° s - 1 ) ; igsize-dpos) || gdpos 

endif 

asize-1+i..i <~ w 
endfor 

a 127..wsize <~ 0 
RegWrite(ra, 128, a) 
enddef 



(( {'><£ C Co* 1 / ^ JC ") 



Exceptions 

Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 
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Operation codes 

W.MULMAT.X.I.8.B

W.MULMAT.X.I.8.L

W.MULMAT.X.I.16.B

W.MULMAT.X.I.16.L

W.MULMAT.X.I.32.B

W.MULMAT.X.I.32.L

W.MULMAT.X.I.64.B

W.MULMAT.X.I.64.L



Wide multiply 
Wide multiply 
Wide multiply 
Wide multiply 
Wide multiply 
Wide 

Wide multiply 



matrix extract immediate signed bytes big-endian
matrix extract immediate signed bytes little-endian
matrix extract immediate signed doublets big-endian
matrix extract immediate signed doublets little-endian
matrix extract immediate signed quadlets big-endian
matrix extract immediate signed quadlets little-endian
matrix extract immediate signed octlets big-endian



Wide multiply matrix extract immediate signed octlets little-endian



W.MULMAT.X.I.C.8.B
W.MULMAT.X.I.C.8.L
W.MULMAT.X.I.C.16.B
W.MULMAT.X.I.C.16.L
W.MULMAT.X.I.C.32.B
W.MULMAT.X.I.C.32.L



Selection 



Wide multiply matrix extract immediate
Wide multiply matrix extract immediate 
Wide multiply matrix extract immediate 
Wide multiply matrix extract immediate 
Wide multiply matrix extract immediate 
Wide multiply matrix extract immediate 



complex bytes big-endian 
complex bytes little-endian 
complex doublets big-endian
complex doublets little-endian
complex quadlets big-endian 
complex quadlets little-endian 
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op 
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size 


order 


wide multiply matrix 
extract immediate 


W.MUL.MAT.X.I 
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LB 



Format 

W.op.tsize.order rd=rc,rb,i
rd=woptsizeorder(rc,rb,i)

31 2423 

I W.op.order" 

8 



18 17 



12 11 



6 5 4 32 



rd 



rc 



rb 1 1 1 sz | sh 



sz <- log(size) - 3
assert size+3 ≥ i ≥ size-4
sh <- i - size



1029 mfrc1M28*128/Btyo\ 
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rb(128) 
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0 



rd(128) 

Wide multiply matrix extract immediate doublets 



128 rd(128) 

Wide multiply matrix extract immediate complex doublets 



Definition v 

def mul(size,h,vs,v,i,ws,w,j) as
    mul <- ((vs&v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws&w_{size-1+j})^{h-size} || w_{size-1+j..j})
enddef

def WideMultiplyMatrixExtractImmediate(op,type,gsize,rd,rc,rb,sh)
    c <- RegRead(rc, 64)
    b <- RegRead(rb, 128)
    lgsize <- log(gsize)
    case type of
        NONE:
            if c_{lgsize-4..0} ≠ 0 then
                raise AccessDisallowedByVirtualAddress
            endif

            if c_{3..lgsize-3} ≠ 0 then
                wsize <- (c and (0-c)) || 0^{4}
                t <- c and (c-1)
            else
                wsize <- 128
                t <- c
            endif

            lwsize <- log(wsize)

            if t_{lwsize+6-lgsize..lwsize-3} ≠ 0 then
                msize <- (t and (0-t)) || 0^{4}
                VirtAddr <- t and (t-1)
            else
                msize <- 128*wsize/gsize
                VirtAddr <- t
            endif

            vsize <- msize*gsize/wsize

C: 

            if c_{lgsize-4..0} ≠ 0 then
                raise AccessDisallowedByVirtualAddress
            endif

            if c_{3..lgsize-3} ≠ 0 then
                wsize <- (c and (0-c)) || 0^{4}
                t <- c and (c-1)
            else
                wsize <- 128
                t <- c
            endif

            lwsize <- log(wsize)

            if t_{lwsize+5-lgsize..lwsize-3} ≠ 0 then
                msize <- (t and (0-t)) || 0^{4}
                VirtAddr <- t and (t-1)
            else
                msize <- 64*wsize/gsize
                VirtAddr <- t
            endif

            vsize <- 2*msize*gsize/wsize

endcase 



case op of

    W.MULMAT.X.I.B:
        order <- B
    W.MULMAT.X.I.L:
        order <- L

endcase

as <- ms <- bs <- 1

m <- LoadMemory(c,VirtAddr,msize,order)
h <- (2*gsize) + 7 - lgsize - (ms and bs)
r <- gsize + (sh>j ||sh) 

for i <- 0 to wsize-gsize by gsize
    q[0] <- 0^{2*gsize+7-lgsize}
    for j <- 0 to vsize-gsize by gsize
        case type of
            NONE:
                q[j+gsize] <- q[j] + mul(gsize,h,ms,m,i+wsize*j_{8..lgsize},bs,b,j)
            C:
                if (~i) & j & gsize = 0 then
                    k <- i-(j&gsize)+wsize*j_{8..lgsize+1}
                    q[j+gsize] <- q[j] + mul(gsize,h,ms,m,k,bs,b,j)
                else
                    k <- i+gsize+wsize*j_{8..lgsize+1}
                    q[j+gsize] <- q[j] - mul(gsize,h,ms,m,k,bs,b,j)
                endif
        endcase
    endfor
    p <- q[vsize]
    s <- 0^{h-r} || ~p_{r} || p_{r}^{r-1}
    v <- ((as & p_{h-1}) || p) + (0 || s)
    if v_{h..r+gsize} = (as & v_{r+gsize-1})^{h+1-r-gsize} then
        a_{gsize-1+i..i} <- v_{gsize-1+r..r}
    else
        a_{gsize-1+i..i} <- as ? (v_{h} || ~v_{h}^{gsize-1}) : 1^{gsize}
    endif
endfor

a_{127..wsize} <- 0
RegWrite(rd, 128, a)
enddef 



Exceptions 



Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 



Operation codes 



W.MUL.MAT.C.F.16.B


Wide multiply matrix complex floating-point half big-endian


W.MUL.MAT.C.F.16.L


Wide multiply matrix complex floating-point half little-endian


W.MULMAT.C.F.32.B 


Wide multiply matrix complex floating-point single big-endian 


W.MULMAT.C.F.32.L 


Wide multiply matrix complex floating-point single little-endian


W.MUL.MAT.F.16.B 


Wide multiply matrix floating-point half big-endian


W.MUL.MAT.F.16.L 


Wide multiply matrix floating-point half little-endian 


W.MULMAT.F.32.B 


Wide multiply matrix floating-point single big-endian 


W.MULMAT.F.32.L 


Wide multiply matrix floating-point single little-endian


W.MULMAT.F.64.B 


Wide multiply matrix floating-point double big-endian 


W.MULMAT.F.64.L 


Wide multiply matrix floating-point double little-endian 



Selection 



class 


op 


type 


prec 


order 


wide multiply matrix 


W.MULMAT 


F 


16 32 64 


LB 


C.F 


16 32 


LB 



Format 

W.op.prec.order rd=rc,rb

rd=wopprecorder(rc,rb) 

31 2423 18 17 12 11 6 5 21 0 



W.MINOR.order 


rd 


rc 


rb 


W.op 


P r 


8 


6 


6 


6 


4 


2 



pr <- log(prec) - 3 




f/6 ' ^ G 



511 



rc(64*128/size) 
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rd(128) o 

Wide multiply matrix complex floating-point half 



Definition 





def mul(size,v,i,w,j) as
    mul <- fmul(F(size,v_{size-1+i..i}),F(size,w_{size-1+j..j}))
enddef

def WideMultiplyMatrixFloatingPoint(major,op,gsize,rd,rc,rb)
c <- RegRead(rc, 64) 
b <- RegRead(rb, 128) 
lgsize <- log(gsize)
switch op of

    W.MULMAT.F.16, W.MULMAT.F.32, W.MULMAT.F.64:
        if c_{lgsize-4..0} ≠ 0 then
            raise AccessDisallowedByVirtualAddress
        endif

        if c_{3..lgsize-3} ≠ 0 then
            wsize <- (c and (0-c)) || 0^{4}
            t <- c and (c-1)
        else
            wsize <- 128
            t <- c
        endif

        lwsize <- log(wsize)

        if t_{lwsize+6-lgsize..lwsize-3} ≠ 0 then
            msize <- (t and (0-t)) || 0^{4}
            VirtAddr <- t and (t-1)

else 

msize <- 128*wsize/gsize 
VirtAddr <-t 

endif 

vsize <- msize*gsize/wsize 
W.MULMAT.C.F.16, W.MUL.MAT.C.F.32, W.MULMAT.C.F.64: 
if c lgsize-4..0 * 0 then 

raise AccessDisallowedByVirtualAddress 



endif 

if c_{3..lgsize-3} ≠ 0 then

wsize <- (c and (0-c)) || 0 4 
t <r- c and (c-1) 

else 

wsize <- 128 
t<-c 

endif 

Iwsize <r- log(wsize) 

if t|wsize+5-lgsize..lwsize-3 * 0 then 
msize <- (t and (0-t)) || 0 4 
VirtAddr <- t and (t-1) 

else 

msize ^- 64*wsize/gsize 
VirtAddr <- t 

endif 

vsize <- 2*msize*gsize/wsize 



endcase 



case major of 
M.MINOR.B: 

order <~ B 
M.MINOR.L: 

order <- L ' 

endcase 

m <- LoadMemory(c,VirtAddr,msize,order)
for i <- 0 to wsize-gsize by gsize
    q[0].t <- NULL
    for j <- 0 to vsize-gsize by gsize
        case op of
            W.MULMAT.F.16, W.MULMAT.F.32, W.MULMAT.F.64:
                q[j+gsize] <- fadd(q[j], mul(gsize,m,i+wsize*j_{8..lgsize},b,j))
            W.MULMAT.C.F.16, W.MULMAT.C.F.32, W.MULMAT.C.F.64:
                if (~i) & j & gsize = 0 then
                    k <- i-(j&gsize)+wsize*j_{8..lgsize+1}
                    q[j+gsize] <- fadd(q[j], mul(gsize,m,k,b,j))
                else
                    k <- i+gsize+wsize*j_{8..lgsize+1}
                    q[j+gsize] <- fsub(q[j], mul(gsize,m,k,b,j))
                endif
        endcase
    endfor

    a_{gsize-1+i..i} <- q[vsize]
endfor

a_{127..wsize} <- 0
RegWrite(rd, 128, a) 
enddef 
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Fyceptions 

Floating-point arithmetic 
Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 



Operation codes 



W.MULMAT.G.8.B 


Wide multiply matrix Galois bytes big-endian


W.MULMAT.G.8.L 


Wide multiply matrix Galois bytes little-endian 



Selection 



class 


op 


size 


order 


Multiply matrix Galois 


W.MULMAT.G 


8 


B L 



Format 



W.op.order ra=rc,rd,rb 



ra=woporder(rc,rd,rb) 

31 2423 

~ W.op.order | 



18 17 



12 11 
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rb 



ra 
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m[rc1(128*128/size) 



OlllOlGlLll 
EGGGGGGOiH 
GBBBCGDCLU 
OODDDOCOObL 



OOOQGDBDOD 
BOBDODBDOC 
BBBOBBBDGG 
BSBQBBDDBO 
iDBDBBBQGL 
BOBOGDCOOl 
BDBBBIBDBO 
IBBiEIUBO 



.•LGLLGlGGGGGGOGG 

"COCrnrnnnnGDOrG 
IIGGGOllGGGGGDOC 



iGGDGDOOGDDGODDGD 
3BGBBGEGBBBB0BGBB 
IBBBBBOCOBOBBBOBB 
iOGBCeOODOOOGGOGO 
:BGGGDOGGBGBBG0GO 
3G0BBBBGB0BBBBBBG 
IGDBDGGGOGGBBGGBG 
jGBGGGBGOGQBSBOGB 
IBBIlBBBBIIlgiOi 



jrGOIjD 

mm 
mm 
mm 

'. LiLj Cii GO 

ionoonn 
:gogobi 

iBBEOGI 
iOBBBBi 
:GODDDI 

mmm 

IBBBBBl 
iOBiiBI 
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Wide multiply matrix Galois byte 



rib 




Definition 

def c <- PolyMultiply(size,a,b) as
    p[0] <- 0^{2*size}
    for k <- 0 to size-1
        p[k+1] <- p[k] ^ a_{k} ? (0^{size-k} || b || 0^{k}) : 0^{2*size}
    endfor
    c <- p[size]
enddef

def c <- PolyResidue(size,a,b) as
    p[0] <- a
    for k <- size-1 to 0 by -1
        p[k+1] <- p[k] ^ p[0]_{size+k} ? (0^{size-k} || 1^{1} || b || 0^{k}) : 0^{2*size}
    endfor
    c <- p[size]_{size-1..0}
enddef

def WideMultiplyMatrixGalois(op,gsize,rd,rc,rb,ra)
    d <- RegRead(rd, 128)
    c <- RegRead(rc, 64)
    b <- RegRead(rb, 128)
    lgsize <- log(gsize)
    if c_{lgsize-4..0} ≠ 0 then

raise AccessDisallowedByVirtualAddress 

endif 

if c 3..lgsize-3 * 0 then 

wsize <r~ (c and (0-c)) || 0 4 
t <- c and (c-1) 

else 

wsize <r- 128 
t <- c 

endif 

Iwsize <- log(wsize) 

'f t|wsize+6-lgsize..lwsize-3 * 0 then 

msize <- (t and (0-t)) || 0 4 

VirtAddr <- t and (t-1) 

else 

msize <- 128*wsize/gsize" 
VirtAddr <- t 

endif 

case op of 

W.MULMAT.G.8.B: 

order <- B 
W.MULMAT.G.8.L: 
order <— L 

endcase 



m <- LoadMemory(c,VirtAddr,msize,order)
for i <- 0 to wsize-gsize by gsize
    q[0] <- 0^{2*gsize}
    for j <- 0 to vsize-gsize by gsize
        k <- i+wsize*j_{8..lgsize}
        q[j+gsize] <- q[j] ^ PolyMultiply(gsize,m_{k+gsize-1..k},d_{j+gsize-1..j})
    endfor

    a_{gsize-1+i..i} <- PolyResidue(gsize,q[vsize],b_{gsize-1..0})
endfor

a_{127..wsize} <- 0
RegWrite(ra, 128, a)
enddef 



■ ICC f (^ nr,u Ob) 



Exceptions 



Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 
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Operation codes 

E.MUL.ADD.X
E.CON.X



Format 

E.op rd@rc,rb,ra 
rd=gop(rd,rc,rb,ra)

31 



Ensemble multiply add extract
Ensemble convolve extract 



24 23 



18 17 



12 11 



6 5 



E.op 

8 



rd 

6 



rc 



rb 



ra 
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Ensemble multiply add extract doublets 



fit,- 1^- 
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rd(128) 

Ensemble complex multiply add extract doublets 

The ensemble-multiply-add-extract instructions (E.MUL.ADD.X), when the x bit is
set, multiply the low-order 64 bits of each of the rc and rb registers and produce
extended (double-size) results.




Ensemble convolve extract doublets 



rd(1 28) 

Ensemble convolve extract complex doublets 
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Definition 

def mul(size,h,vs,v,i,ws,w,j) as
    mul <- ((vs&v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws&w_{size-1+j})^{h-size} || w_{size-1+j..j})
enddef

def EnsembleExtractInplace(op,ra,rb,rc,rd) as
    d <- RegRead(rd, 128)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)

case a_{8..0} of
    0..255:
        sgsize <- 128
    256..383:
        sgsize <- 64
    384..447:
        sgsize <- 32
    448..479:
        sgsize <- 16
    480..495:
        sgsize <- 8
    496..503:
        sgsize <- 4
    504..507:
        sgsize <- 2
    508..511:
        sgsize <- 1
endcase
l <- a_{11}
m <- a_{12}
n <- a_{13}
signed <- a_{14}
x <- a_{15}

case op of 

E.CON.X: 

if (sgsize < 8) then 

gsize <- 8 
elseif (sgsize*(n+1)*(x+1) > 128) then 
gsize <- 128/(n+1)/(x+1) 

else 

gsize <- sgsize 

endif 

lgsize <- log(gsize)

wsize<-128/(x+1) 

vsize <- 128 

ds <- cs <- signed 

bs <- signed A m 

zs <— signed or m or n 

zsize <- gsize*(x+1) 

h <- (2*gsize) + log(vsize) - Igsize 

spos <- (a_{8..0}) and (2*gsize-1)




E.MULADD.X: 

if (sgsize < 8) then 

gsize <- 8 
elseif (sgsize*(n+1)*(x+1) > 128) then 

gsize «- 128/(n+1)/(x+1) 

else 

gsize «- sgsize 

endif 

ds <r- signed 

cs *- signed A m 

zs <- signed or m or n 

zsize<-gsize*(x+1) 

h <- (2*gsize) + n 

spos <- (a_{8..0}) and (2*gsize-1)

endcase

dpos <- (0 || a_{23..16}) and (zsize-1)
r <- spos

sfsize <- (0 || a_{31..24}) and (zsize-1)

tfsize <- (sfsize = 0) or ((sfsize+dpos) > zsize) ? zsize-dpos : sfsize
fsize <- (tfsize + spos > h) ? h - spos : tfsize
if (b_{10..9} = Z) and not as then
    rnd <- F

else

    rnd <- b_{10..9}

endif 



for k <- 0 to wsize-zsi7^ s y zsize ^ 

i <- k*gsize/zsi^ ^ 

case op of 

E.CON.X: 

q[0] *- 02*gs\ze^7Ags\ze 
for j <- 0 to vsize-gsize by gsize 
if n then 

if (~i) & j & gsize = 0 then 

q[j+gsize] «- qO] + muKgsize.h^s.m.i+IZS-j.bs.bj) 

else 

qg+gsize] <- q[j] - mul(gsize l h l ms,m,i+128-j+2*gsize ( bs l b l j) 

endif 

else < 
qD+gsize] *- q[j] + mul(gsize ( h ( ms t m j+-12§-j,bs ( bj) 

endif 
endfor 
p <- q[vsize] 
E.MULADD.X: 

di <- ((ds and dk+zsize-I^^IKdk+zsize-L.iailOO 

if n then 

if (i and gsize) = 0 then 

p<-mul(gsize t h t ds t d ( i t cs t c,i)- 
mul(gsize 1 h ( ds ) d,i+gsize l cs l c l i+gsize)+di 

else 

p<-mul(gsize t h ) ds ) dj,cs 1 cj+gsize)+mul(gsize,h I ds ( d l i ) cs,c I i+gsize)+di 
endif 

else 

p <_ muKgsize.h.ds.d.i.cs.cj) + di 

endif 

endcase 
case rnd of 
N: 

s<-0 h - r || ~Pr II P r r" 1 

Z: 

s <- 0 h " r || pf,--i 

F: 

s 0 h 

C: 

s^0 h - r || 1 r 

endcase 

v^((zs&p h -i)l|p) + (0||s) 

if (v h resize = (zs & v r+fsize -l) h+1 - r - fsize ) or not (I and (op = E. EXTRACT)) then 
w <- (zs & v r+fs ize-l) zsize " fsize ' dp0S It Vfsize-1+r..r II 0 d P os 

else 

w <- (zs ? (v h || -vfi size - d P° s - 1 ) : iwize-dpos) || 0 d P os 

endif 

Zzsize-1+k..k <- w 
endfor 

RegWrite(rd t 128, z) 
enddef 



f , c . ( ? G 




Operation codes 



E.MULX 


Ensemble multiply extract , . , 


E. EXTRACT 


Ensemble extract 


E.SCALADD.X 


Ensemble scale add extract 



Format 

E.op ra=rd,rc,rb
ra=eop(rd,rc,rb) 



31 


24 


23 


18 


17 


12 


11 


6 


5 
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E.op 


rd 


rc 


rb 


ra 
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ra(128) 


0 



Ensemble complex multiply extract doublets 



The ensemble-multiply-extract instructions (E.MUL.X), when the x bit is set,
multiply the low-order 64 bits of each of the rc and rb registers and produce 
extended (double-size) results. 
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Ensemble scale add extract doublets 



if 



2P^ 
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7 



n 
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ra(128) 

Ensemble complex scale add extract doublets 
fields and produce extended (doubl! "***" " y the rb re S i «"- 
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Ensemble extract 



Definition 

def mul(size,h,vs,v,i,ws,w,j) as
    mul <- ((vs&v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws&w_{size-1+j})^{h-size} || w_{size-1+j..j})
enddef

def EnsembleExtract(op,ra,rb,rc,rd) as 
d <- RegRead(rd, 128) 

c <- RegRead(rc ( 128) * 
b <- RegRead(rb, 128) )*• 
case b_{8..0} of
    0..255:
        sgsize <- 128
    256..383:
        sgsize <- 64
    384..447:
        sgsize <- 32
    448..479:
        sgsize <- 16
    480..495:
        sgsize <- 8
    496..503:
        sgsize <- 4
    504..507:
        sgsize <- 2
    508..511:
        sgsize <- 1
endcase
l <- b_{11}
m <- b_{12}
n <- b_{13}
signed <- b_{14}
x <- b_{15}
case op of 

E. EXTRACT: 

gsize <- sgsize*(2-(m or x)) 

zsize <- sgsize 

h <- gsize 

as <r- signed 

spos <- (b_{8..0}) and (gsize-1)



f(G 20J 





E.SCALADD.X: 

if (sgsize < 8) then 

gsize <- 8 
elseif (sgsize*(n+1 ) > 32) then 

gsize <-32/(n+1) 

else 

gsize <- sgsize 

endif 

ds <- cs <- signed 
bs <- signed A m 
as <- signed or m or n 
zsize <-gsize*(x+1) 
h <- (2*gsize) + 1 + n
spos <- (b_{8..0}) and (2*gsize-1)
E.MULX: 

if (sgsize < 8) then 

gsize <r~ 8 
elseif (sgsize*(n+1)*(x+1) > 128) then 

gsize <- 128/(n+1)/(x+1) 

else 

gsize <- sgsize 

endif 

ds <- signed 

cs <- signed A m 

as <r- signed or m or n 

zsize <- gsize*(x+1) 

h <- (2*gsize) + n 

spos <- (b_{8..0}) and (2*gsize-1)

endcase

dpos <- (0 || b_{23..16}) and (zsize-1)
r <- spos

sfsize <- (0 || b_{31..24}) and (zsize-1)

tfsize <- (sfsize = 0) or ((sfsize+dpos) > zsize) ? zsize-dpos : sfsize
fsize <- (tfsize + spos > h) ? h - spos : tfsize
if (b_{10..9} = Z) and not as then
    rnd <- F

else

    rnd <- b_{10..9}



endif 



^V6 . 2 o J 



for j <r- 0 to 1 28-zsize by zsize 
j <_ j*gsize/zsize 
case op of 

E. EXTRACT: 

if m or x then 

P *~ dgsize+i-1..i 

else 

P ^— (CJ || C)g S i Ze +i-1..i 

endif 

E.MULX: 4 i 
if n then 

if (i and gsize) = 0 then 

p <r- mui(gsize,h l ds ( d ( i f cs l c ( i)- 
muKgsize.h.ds.dj+gsize.cs.ci+gsize) 

else 

muKgsize.h.ds.d.i.cs.cJ+gsizeJ+muKgsize.h.ds.d.i.cs.cJ+gsize) 

endif 

else 

p <r- muKgsize.h.ds.d^cs.c.i) 

endif 
E.SCALADD.X: 
if n then 

if (i and gsize) = 0 then 

p <~ mul(gsize ( h ) ds,d ( i,bs ( b,64+2*gsize) 
+ muKgsizeAcs.c.i.bs.b^) 

- mul(gsize,h,ds 1 d,i+gsize I bs 1 b 1 64+3*gsize) 

- mul(gsize,h,cs ) c,i+gsize I bs,b I 64+gsize) 

else 

p <r~ mul(gsize t h 1 ds,d,i,bs 1 b,64+3*gsize) 
+ mul(gsize,h 1 cs,c I i,bs,b,64+gsize) 
+ mul(gsize,h,ds,d I i+gsize t bs,b ( 64+2*gsize) 
+ mul(gsize ) h,cs,c,i+gsize,bs l b,64) 

endif 

else 

p <- mul(gsize ( h t ds,d 1 i t bs ( b 1 64+gsize) + mul(gsize 1 h ) cs 1 c,i l bs,b 1 64) 

endif 

endcase 
case rnd of 
N: 





s <- 0 h 


Z: 






s <-0 h 


F: 






s <- 0 h 


C: 






s <-0 h 


endcase 







2 o 



v^((as&p h . 1 )||p) + (0||s) 

'f (Vh..r+fsize = (as & v r+fsize . 1 ) h+1 - r - fsiz e) or not (I and (op = E. EXTRACT}) then 
w <- (as & v r+f size-i) 2size - fsize - d P 0S || Vfsize-1+r..r II 0 d P° s 



if m and (op = E. EXTRACT) then 

Zzsize-1+j..j <- Casize-l+j.-dpos+fsize+j II Wdpos+fsize-L.dpos II Cdpos-1+j..j 

else 

Zzsize-1+j..j<~w 

endif 
endfor 

RegWrite(ra, 128, z) 



else 



w <- (s ? (Vh || -vpiz©-dpos-1) ; izsize-dpos) y gdpos ; * ; \ 



endif 



enddef 
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Gateway with pointers to code and data spaces 
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Typical dynamic-linked, inter-gateway calling sequence: 
caller: 

caller:   A.ADDI    sp@-size      // allocate caller stack frame

          S.I.64.A  lp,sp,off

          S.I.64.A  dp,sp,off

          L.I.64.A  lp=dp,off     // load lp

          L.I.64.A  dp=dp,off     // load dp
          B.GATE

          L.I.64.A  dp=sp,off
          ... (code using dp)

          L.I.64.A  lp=sp,off     // restore original lp register

          A.ADDI    sp=size       // deallocate caller stack frame

          B         lp            // return



callee (non-leaf): 

calee: LI.64.A 
S.L64.A 
LI.64.A 
S.I.64.A 
S.I.64.A 
... (using dp) 
LI.64.A 

... (code using dp) 
L.I.64.A 
LI.64.A 
B.DOWN 

callee (leaf, no stack): 



callee: 



... (using dp) 
B.DOWN 



dp=dp,off 

sp.dp.off 

sp=dp,off 

Ip.sp.off 

dp.sp.off 

dp.sp.off 

lp=sp,off 
sp=sp,off 

Ip 



// load dp with data pointer 
// new stack pointer 



// restore original Ip register 
// restore original sp register 



(((, . r > 




Operation codes




Format 

B.GATE rb 
bgate(rb) 





Branch gateway 



Definition 



def BranchGateway(rd,rc,rb) as
    c <- RegRead(rc, 64)
    b <- RegRead(rb, 64)
    if (rd ≠ 0) or (rc ≠ 1) then
        raise ReservedInstruction
    endif

    if c_{2..0} ≠ 0 then
        raise AccessDisallowedByVirtualAddress
    endif

    d <- ProgramCounter_{63..2}+1 || PrivilegeLevel
    if PrivilegeLevel < b_{1..0} then
        m <- LoadMemoryG(c,c,64,L)
        if b ≠ m then
            raise GatewayDisallowed
        endif

        PrivilegeLevel <- b_{1..0}
    endif

    ProgramCounter <- b_{63..2} || 0^{2}
    RegWrite(rd, 64, d)
    raise TakenBranch
enddef



Exceptions 

Reserved Instruction 
Gateway disallowed 
Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 



Operation codes 



E.SCALADD.F.16 


Ensemble scale add floating-point half 


E.SCAL.ADD.F.32 


Ensemble scale add floating-point single 


E.SCAL.ADD.F.64 


Ensemble scale add floating-point double 



Selection 



class 


op 


prec 


scale add 


E.SCAL.ADD.F 


16 32 64 



Format 

E. op. prec ra=rd,rc,rb 
ra=eopprec(rd,rc,rb) 



31          24 23      18 17      12 11       6 5        0
|  E.op.prec  |    rd    |    rc    |    rb    |    ra    |
       8           6          6          6          6
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Definition 

def EnsembleFloatingPointTernary(op,prec,rd,rc,rb,ra) as
    d <- RegRead(rd, 128)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    for i <- 0 to 128-prec by prec
        di <- F(prec, d_{i+prec-1..i})
        ci <- F(prec, c_{i+prec-1..i})
        ai <- fadd(fmul(di, F(prec, b_{prec-1..0})), fmul(ci, F(prec, b_{2*prec-1..prec})))
        a_{i+prec-1..i} <- PackF(prec, ai, none)
    endfor
    RegWrite(ra, 128, a)
enddef
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Operation codes 



G. BOOLEAN 



Group boolean 



Selection 



operation    function (binary)    function (decimal)
d            11110000             240
c            11001100             204
b            10101010             170
d&c&b        10000000             128
(d&c)|b      11101010             234
d|c|b        11111110             254
d?c:b        11001010             202
d^c^b        10010110             150
~d^c^b       01101001             105
0            00000000             0



Format 

G. BOOLEAN rd@trc,trb,f 
rd=gbooleani(rd,rc,rb,f) 

31 25 2423 18 17 

I G.BOOLEAN |ih| rd I rc 



if f6=f5 then
    if f2=f1 then
        if f2 then
            rc <- max(trc,trb)
            rb <- min(trc,trb)
        else
            rc <- min(trc,trb)
            rb <- max(trc,trb)
        endif
        ih <- 0
        il <- 0 || f6 || f7 || f4 || f3 || f0
    else
        if f2 then
            rc <- trb
            rb <- trc
        else
            rc <- trc
            rb <- trb
        endif
        ih <- 0
        il <- 1 || f6 || f7 || f4 || f3 || f0
    endif
else
    ih <- 1
    if f6 then
        rc <- trb
        rb <- trc
        il <- f1 || f2 || f7 || f4 || f3 || f0
    else
        rc <- trc
        rb <- trb
        il <- f2 || f1 || f7 || f4 || f3 || f0
    endif
endif



Definition 

def GroupBoolean(ih,rd,rc,rb,il)
    d <- RegRead(rd, 128)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    if ih=0 then
        if il_{5}=0 then
            f <- il_{3} || il_{4} || il_{4} || il_{2} || il_{1} || (rc>rb)^{2} || il_{0}
        else
            f <- il_{3} || il_{4} || il_{4} || il_{2} || il_{1} || 0 || 1 || il_{0}
        endif
    else
        f <- il_{3} || 0 || 1 || il_{2} || il_{1} || il_{5} || il_{4} || il_{0}
    endif
    for i <- 0 to 127 by size
        a_{i} <- f_{(d_{i} || c_{i} || b_{i})}
    endfor
    RegWrite(rd, 128, a)
enddef



Operation codes 



I B.HINT 



Branch Hint 



Format 

B.HINT badd,count,rd 
bhint(badd,count,rd) 



31 



24 23 



1817 



1211 



6 5 



B.HINT | 



B.MINOR 



rd 



| count 



simm 



8 



simm <— badd-pc-4 



Definition 

def BranchHint(rd,count,simm) as
    d <- RegRead(rd, 64)
    if (d_{1..0}) ≠ 0 then
        raise AccessDisallowedByVirtualAddress
    endif
    FetchHint(ProgramCounter + 4 + (0 || simm || 0^{2}), d_{63..2} || 0^{2}, count)
enddef



F(6 . 2-^ 



Exceptions 

Access disallowed by virtual address 



Operation codes 



E.SINK.F.16

Ensemble convert floating-point doublets from half nearest default

E.SINK.F.16.C

Ensemble convert floating-point doublets from half ceiling

E.SINK.F.16.C.D

Ensemble convert floating-point doublets from half ceiling default

E.SINK.F.16.F

Ensemble convert floating-point doublets from half floor

E.SINK.F.16.F.D

Ensemble convert floating-point doublets from half floor default

E.SINK.F.16.N

Ensemble convert floating-point doublets from half nearest

E.SINK.F.16.X

Ensemble convert floating-point doublets from half exact

E.SINK.F.16.Z

Ensemble convert floating-point doublets from half zero

E.SINK.F.16.Z.D

Ensemble convert floating-point doublets from half zero default

E.SINK.F.32

Ensemble convert floating-point quadlets from single nearest default

E.SINK.F.32.C

Ensemble convert floating-point quadlets from single ceiling

E.SINK.F.32.C.D

Ensemble convert floating-point quadlets from single ceiling default

E.SINK.F.32.F

Ensemble convert floating-point quadlets from single floor

E.SINK.F.32.F.D

Ensemble convert floating-point quadlets from single floor default

E.SINK.F.32.N

Ensemble convert floating-point quadlets from single nearest

E.SINK.F.32.X

Ensemble convert floating-point quadlets from single exact

E.SINK.F.32.Z

Ensemble convert floating-point quadlets from single zero

E.SINK.F.32.Z.D

Ensemble convert floating-point quadlets from single zero default

E.SINK.F.64

Ensemble convert floating-point octlets from double nearest default

E.SINK.F.64.C

Ensemble convert floating-point octlets from double ceiling

E.SINK.F.64.C.D

Ensemble convert floating-point octlets from double ceiling default

E.SINK.F.64.F

Ensemble convert floating-point octlets from double floor

E.SINK.F.64.F.D

Ensemble convert floating-point octlets from double floor default

E.SINK.F.64.N

Ensemble convert floating-point octlets from double nearest

E.SINK.F.64.X

Ensemble convert floating-point octlets from double exact

E.SINK.F.64.Z

Ensemble convert floating-point octlets from double zero

E.SINK.F.64.Z.D

Ensemble convert floating-point octlets from double zero default

E.SINK.F.128

Ensemble convert floating-point hexlet from quad nearest default

E.SINK.F.128.C

Ensemble convert floating-point hexlet from quad ceiling

E.SINK.F.128.C.D

Ensemble convert floating-point hexlet from quad ceiling default

E.SINK.F.128.F

Ensemble convert floating-point hexlet from quad floor

E.SINK.F.128.F.D

Ensemble convert floating-point hexlet from quad floor default

E.SINK.F.128.N

Ensemble convert floating-point hexlet from quad nearest

E.SINK.F.128.X

Ensemble convert floating-point hexlet from quad exact

E.SINK.F.128.Z

Ensemble convert floating-point hexlet from quad zero

E.SINK.F.128.Z.D

Ensemble convert floating-point hexlet from quad zero default




Selection 





op 


prec 


round/trap 


integer from float 


SINK. 


16 32 64 128 


NONE C F N X Z C.D F.D Z.D



Format 



E.SINK.F.prec.rnd rd=rc

rd=esinkfprecrnd(rc)

31 24 23 



18 17 



12 11 



6 5 



E.prec 



rd 



rc 



8 



E.SINK.F.rnd 



E.UNARY 



# 



# 



253° 



Definition 

def EnsembleSinkFloatingPoint(prec,round,rd,rc) as
    c <- RegRead(rc, 128)
    for i <- 0 to 128-prec by prec
        ci <- F(prec, c_{i+prec-1..i})
        a_{i+prec-1..i} <- fsinkr(prec, ci, round)
    endfor
    RegWrite(rd, 128, a)
enddef



r. 




Exceptions 
Floating-point arithmetic 



Definition 

def eb <- ebits(prec) as
    case prec of
        16:
            eb <- 5
        32:
            eb <- 8
        64:
            eb <- 11
        128:
            eb <- 15
    endcase
enddef

def eb <- ebias(prec) as
    eb <- 0 || 1^{ebits(prec)-1}
enddef

def fb <- fbits(prec) as
    fb <- prec - 1 - eb
enddef

def a <- F(prec, ai) as
    a.s <- ai_{prec-1}
    ae <- ai_{prec-2..fbits(prec)}
    af <- ai_{fbits(prec)-1..0}
    if ae = 1^{ebits(prec)} then
        if af = 0 then
            a.t <- INFINITY
        elseif af_{fbits(prec)-1} then
            a.t <- SNaN
            a.e <- -fbits(prec)
            a.f <- 1 || af_{fbits(prec)-2..0}
        else
            a.t <- QNaN
            a.e <- -fbits(prec)
            a.f <- af
        endif
    elseif ae = 0 then
        if af = 0 then
            a.t <- ZERO
        else
            a.t <- NORM
            a.e <- 1-ebias(prec)-fbits(prec)
            a.f <- 0 || af
        endif
    else
        a.t <- NORM
        a.e <- ae-ebias(prec)-fbits(prec)
        a.f <- 1 || af
    endif
enddef



■ r^- 2 s D 



7>7» 



def a <- DEFAULTQNAN as
    a.s <- 0
    a.t <- QNAN
    a.e <- -1
    a.f <- 1
enddef

def a <- DEFAULTSNAN as
    a.s <- 0
    a.t <- SNAN
    a.e <- -1
    a.f <- 1
enddef



def fadd(a,b) as faddr(a,b,N) enddef

def c <- faddr(a,b,round) as
    if a.t=NORM and b.t=NORM then
        // d,e are a,b with exponent aligned and fraction adjusted
        if a.e > b.e then
            d <- a
            e.t <- b.t
            e.s <- b.s
            e.e <- a.e
            e.f <- b.f || 0^{a.e-b.e}
        else if a.e < b.e then
            d.t <- a.t
            d.s <- a.s
            d.e <- b.e
            d.f <- a.f || 0^{b.e-a.e}
            e <- b
        endif
        c.t <- d.t
        c.e <- d.e
        if d.s = e.s then
            c.s <- d.s
            c.f <- d.f + e.f
        elseif d.f > e.f then
            c.s <- d.s
            c.f <- d.f - e.f
        elseif d.f < e.f then
            c.s <- e.s
            c.f <- e.f - d.f
        else
            c.s <- r=F
            c.t <- ZERO
        endif
    // priority is given to b operand for NaN propagation
    elseif (b.t=SNAN) or (b.t=QNAN) then
        c <- b
    elseif (a.t=SNAN) or (a.t=QNAN) then
        c <- a
    elseif a.t=ZERO and b.t=ZERO then
        c.t <- ZERO
        c.s <- (a.s and b.s) or (round=F and (a.s or b.s))
    // NULL values are like zero, but do not combine with ZERO to alter sign
    elseif a.t=ZERO or a.t=NULL then
        c <- b
    elseif b.t=ZERO or b.t=NULL then
        c <- a
    elseif a.t=INFINITY and b.t=INFINITY then
        if a.s ≠ b.s then
            c <- DEFAULTSNAN // Invalid
        else
            c <- a
        endif
    elseif a.t=INFINITY then
        c <- a
    elseif b.t=INFINITY then
        c <- b
    else
        assert FALSE // should have covered all the cases above
    endif
enddef

def b <- fneg(a) as
    b.s <- ~a.s
    b.t <- a.t
    b.e <- a.e
    b.f <- a.f
enddef

def fsub(a,b) as fsubr(a,b,N) enddef

def fsubr(a,b,round) as faddr(a,fneg(b),round) enddef

def frsub(a,b) as frsubr(a,b,N) enddef

def frsubr(a,b,round) as faddr(fneg(a),b,round) enddef



V 




def c <- fcom(a,b) as
    if (a.t=SNAN) or (a.t=QNAN) or (b.t=SNAN) or (b.t=QNAN) then
        c <- U
    elseif a.t=INFINITY and b.t=INFINITY then
        if a.s ≠ b.s then
            c <- (a.s=0) ? G: L
        else
            c <- E
        endif
    elseif a.t=INFINITY then
        c <- (a.s=0) ? G: L
    elseif b.t=INFINITY then
        c <- (b.s=0) ? G: L
    elseif a.t=NORM and b.t=NORM then
        if a.s ≠ b.s then
            c <- (a.s=0) ? G: L
        else
            if a.e > b.e then
                af <- a.f
                bf <- b.f || 0^{a.e-b.e}
            else
                af <- a.f || 0^{b.e-a.e}
                bf <- b.f
            endif
            if af = bf then
                c <- E
            else
                c <- ((a.s=0) ^ (af > bf)) ? G : L
            endif
        endif
    elseif a.t=NORM then
        c <- (a.s=0) ? G: L
    elseif b.t=NORM then
        c <- (b.s=0) ? G: L
    elseif a.t=ZERO and b.t=ZERO then
        c <- E
    else
        assert FALSE // should have covered all the cases above
    endif
enddef



• - • f 1 ** 

def c <- fmul(a,b) as
    if a.t=NORM and b.t=NORM then
        c.s <- a.s ^ b.s
        c.t <- NORM
        c.e <- a.e + b.e
        c.f <- a.f * b.f
    // priority is given to b operand for NaN propagation
    elseif (b.t=SNAN) or (b.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- b.t
        c.e <- b.e
        c.f <- b.f
    elseif (a.t=SNAN) or (a.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- a.t
        c.e <- a.e
        c.f <- a.f
    elseif a.t=ZERO and b.t=INFINITY then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=INFINITY and b.t=ZERO then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=ZERO or b.t=ZERO then
        c.s <- a.s ^ b.s
        c.t <- ZERO
    else
        assert FALSE // should have covered all the cases above
    endif
enddef



i" 



def c <- fdivr(a,b) as
    if a.t=NORM and b.t=NORM then
        c.s <- a.s ^ b.s
        c.t <- NORM
        c.e <- a.e - b.e + 256
        c.f <- (a.f || 0^{256}) / b.f
    // priority is given to b operand for NaN propagation
    elseif (b.t=SNAN) or (b.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- b.t
        c.e <- b.e
        c.f <- b.f
    elseif (a.t=SNAN) or (a.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- a.t
        c.e <- a.e
        c.f <- a.f
    elseif a.t=ZERO and b.t=ZERO then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=INFINITY and b.t=INFINITY then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=ZERO then
        c.s <- a.s ^ b.s
        c.t <- ZERO
    elseif a.t=INFINITY then
        c.s <- a.s ^ b.s
        c.t <- INFINITY
    else
        assert FALSE // should have covered all the cases above
    endif
enddef

def msb <- findmsb(a) as
    MAXF <- 2^{18} // Largest possible f value after matrix multiply
    for j <- 0 to MAXF
        if a_{MAXF-1..j} = (0^{MAXF-1-j} || 1) then
            msb <- j
        endif
    endfor
enddef




def ai <— PackF(prec,a,round) as 
case a.t of 
NORM: 

msb <— findmsb(a.f) 

rn msb-l-fbits(prec) // lsb for normal 

rdn <r- -ebias(prec)-a.e-l-fbits(prec) // lsb if a denormal 

rb <— (rn > rdn) ? rn : rdn 

if rb <, 0 then 

aifr<-a.f msb .i„ 0 ||0- rb t 
eadj <~ 0 \r : 

else 

case round of 
C: 

s <- Q™sb-rb || (_ a<s )rb 

F: 

s <- O^sb-rb || ( a s )rb f 
N, NONE: * 
s< _0«isb-ib ||^ a .f rb || a4^1 

X: 

ifa.f r b-i..o^Othen 

raise FloatingPointArithmetic // Inexact 

endif 
s <- 0 

Z: 

s <-0 

endcase 

v <- (0||a.f msb „ 0 ) + (0||s) 
if v msb = 1 then 

aifr<- v msb . L r5 

eadj <- 0 

else 

aifr <~ Qfh\ts(pree) 
eadj <r- 1 

endif 

endif 

aien <- a.e + msb - 1 + eadj + ebias(prec) 
if aien < 0 then 

if round = NONE then 

ai <- a.s || 0 ebits (P rec ) || aifr 

else 

raise FloatingPointArithmetic //Underflow 

endif 

elseifaien> lebits(prec) ^ 
if round = NONE then 

//default: round-to-nearest overflow handling 
ai <_ a.s || lebits(prec) y 0 fbits(prec) 

else 

raise FloatingPointArithmetic //Overflow 

endif 

else 

ai <- a.s || aien c bit s (prec)-1..0 II aifr 

endif 



Pi <->.. I ■' 



SNAN: 

if round * NONE then 

raise FloatingPointArithmetic //Invalid 

endif 

if-a.e < fbits(prec) then 

ai <- a.s || lebits(prec) || a.f_a. e -1..0 II O^i^P^^a e 

else 

lsb <- a.£a. e -l-fbits(precH »0 * 0 

ai <_ a . s [| ie bits(prec) „ a.^e-L.-ae-l-fbitsCprec^ II kb 

endif 
QNAN: 

if -a.e < fbits(prec) then 

ai <- a.s || iebits(prec) || a .ta.c-I..O || 0 fbits(prec)+a.e 
else : 

lsb <- a.f.a. e -l-fbits(prec)+l ..0 * 0 

ai <- a.s || lebits(prec) y a.f. a . e .i...a.e-l.fbits(prec) + 2 II lsb 

endif 
ZERO: 

ai <- a.s || 0 ebits (P rec ) || O^^P^) 
INFINITY: 

ai <- a.s || lebits(prec) || Qfbits(prec) 

endcase 



def ai <- fsinkr(prec, a, round) as 
case a.t of 
NORM: 

msb <— findmsb(a.f) 
rb <- -a.e 
ifrb^Othen 

aifr<-a.f msb ^ 0 || <r rb 
aims <— msb - rb 

else 

case round of 
C, CD: 

s <- 0 msb - rb || (~ai. s ) rb 
F, F.D: 

s <_ Qmsb-rb j| ( ai s yb 
N, NONE: 

s< _0^b-rb|,^ ifrb| | ai ^.i 

X: 

ifai.f r b-i ..o*0 then 

raise FloatingPointArithmetic // Inexact 

endif 
s <-0 
Z, Z.D: 

s <-0 

endcase 

v^(0||a.f msb .. 0 ) + (0||s) 
ifv msb = 1 then 

aims <- msb + 1 - rb 

else 

aims <- msb - rb 

endif 

aifr <" v aims< rb 

endif 

if aims > prec then 
case round of 

CD, F.D, NONE, Z.D: 

ai«-a.s || as)P rec " 1 
C, F, N, X, Z: 

raise FloatingPointArithmetic // Overflow 

endcase 
. elseif a.s = 0 then 
ai <r- aifr 

else 

ai <- -aifr 

endif 
ZERO: 

ai <- 0P rec 
SNAN, QNAN: 

case round of 

CD, F.D, NONE, Z.D: 

ai <- 0P rec 
C, F, N, X, Z: 

raise FloatingPointArithmetic // Invalid 



• • - s z * 7 > 

endcase 
INFINITY: 

case round of 

CD, F.D, NONE, Z.D: 

ai <- a.s || (~as)P rec -l 
C, F, N, X, Z: 

raise FloatingPointArithmetic // Invalid

endcase 

endcase 
enddef 

— - • - - •• i 



def c <- frecrest(a) as
    b.s <- 0
    b.t <- NORM
    b.e <- 0
    b.f <- 1
    c <- fest(fdiv(b,a))
enddef

def c <- frsqrest(a) as
    b.s <- 0
    b.t <- NORM
    b.e <- 0
    b.f <- 1
    c <- fest(fsqr(fdiv(b,a)))
enddef

def c <- fest(a) as
    if (a.t=NORM) then
        msb <- findmsb(a.f)
        a.e <- a.e + msb - 13
        a.f <- a.f_{msb..msb-12} || 1
    else
        c <- a
    endif
enddef



def c <- fsqr(a) as
    if (a.t=NORM) and (a.s=0) then
        c.s <- 0
        c.t <- NORM
        if (a.e_{0} = 1) then
            c.e <- (a.e-127) / 2
            c.f <- sqr(a.f || 0^{127})
        else
            c.e <- (a.e-128) / 2
            c.f <- sqr(a.f || 0^{128})
        endif
    elseif (a.t=SNAN) or (a.t=QNAN) or a.t=ZERO or ((a.t=INFINITY) and (a.s=0)) then
        c <- a
    elseif ((a.t=NORM) or (a.t=INFINITY)) and (a.s=1) then
        c <- DEFAULTSNAN // Invalid
    else
        assert FALSE // should have covered all the cases above
    endif
enddef



Format 

G.op.size rd=rc,rb 

rd=gopsize(rc,rb) 
31 



24 23 



18 17 



12 11 



6 5 



G.size 
8 



rd 



rc 



rb 



op 



Fig. 26B 



# 



Operation codes 



G.ADD.8 


Group add bytes 


G.ADD.16 


Group add doublets 


G.ADD.32 


Group add quadlets 


G.ADD.64 


Group add octlets 


G.ADD.128 


Group add hexlet 


G.ADD.L.8 


Group add limit signed bytes 


G.ADD.L.16 


Group add limit signed doublets 


G.ADD.L.32 


Group add limit signed quadlets 


G.ADD.L.64 


Group add limit signed octlets 


G.ADD.L.128 


Group add limit signed hexlet 


G.ADD.L.U.8 


Group add limit unsigned bytes 


G.ADD.L.U.16 


Group add limit unsigned doublets 


G.ADD.L.U.32 


Group add limit unsigned quadlets 


G.ADD.L.U.64 


Group add limit unsigned octlets 


G.ADD.L.U.128 


Group add limit unsigned hexlet 


G.ADD.8.0 


Group add signed bytes check overflow 


G.ADD.16.0 


Group add signed doublets check overflow 


G.ADD.32.0 


Group add signed quadlets check overflow 


G.ADD.64.0 


Group add signed octlets check overflow 


G.ADD.128.0 


Group add signed hexlet check overflow 


G.ADD.U.8.0 


Group add unsigned bytes check overflow 


G.ADD.U.16.0 


Group add unsigned doublets check overflow 


G.ADD.U.32.0 


Group add unsigned quadlets check overflow 


G.ADD.U.64.0 


Group add unsigned octlets check overflow 


G.ADD.U.128.0 


Group add unsigned hexlet check overflow 



Fig. 26A 




# 



Definition 

def Crossbar(op,size,rd,rc,rb)
c <- RegRead(rc, 128)
b <- RegRead(rb, 128)
shift <- b and (size-1)
case op_{5..2} || 0^{2} of
X.COMPRESS: 

hsize <— size/2 

for i <- 0 to 64-hsize by hsize 
if shift < hsize then 

a i+hsize-l..i c i+i+shift+hsize-l..i+i+shift 

else 

ai+hsize-l..i <- cf+l+slze-^ II c i+i+size-l..i+i+shift 

endif 
endfor 
ai27..64<-0 
X.COMPRESS.U: 
hsize <- size/2 

for i <- 0 to 64-hsize by hsize 
if shift < hsize then 

a i+hsize-l..i *~ c i-H+shift+hsize-l..i+i+shift 

else 

ai+hsize-l..i <- 0 shift ' hsize || Ci+j +s j ze -i..i+i+ s hift 

endif 
endfor 

ai27..64^0 
X.EXPAND: 

hsize <- size/2 

for i <- 0 to 64-hsize by hsize 
if shift < hsize then 

ai+i+size-l..i+i <- c{Vtelli ft II c i+hs i 2e .i..i II 0 shift 

else 

ai+i+size-l..i+i <~ c i+size-shift-l..i II ° shlft 

endif 
endfor 
X.EXPAND.U: 

hsize <- size/2 

for i <- 0 to 64-hsize by hsize 
if shift < hsize then 

a i+ i + size-l..i + i <- O^ize-shift „ c i+hsize _, j || 0*"* 

else 

ai+i+sizc-l..i+i <- c i+size-shift-l..i II ° shlft 

endif 
endfor 
X.ROTL: 

for i <- 0 to 128-size by size 

a i+size-l..i *~ c i+size-l -shift.. i II c i+size-l..i+size-l -shift 
endfor 



Fig. 32C 



X.ROTR: 

for i <- 0 to 128-size by size 

a i+size-l..i *~ c i+shift-l..i II c i+size-l..i+shift 
endfor 
X.SHL: 

for i <- 0 to 128-size by size 

a i+size-l..i *~ c i+size-l -shift.. i II 0 S ^ 
endfor 
X.SHL.O: 

for i <— 0 to 128-size by size 

if Ci+size- 1 . .i+size- 1 -shift * c f+sizei 1-shift then 
raise FixedPointArithmetic 

endif 

a i+size-l..i *~ c i+size-l -shift.. ill 
endfor 
X.SHL.U.O: 

for i <- 0 to 128-size by size 

if Ci+size-l..i+size-shift * 0 shift then 
raise FixedPointArithmetic 

endif 

a i+size-l..i *~ c i+size-l -shifL.il I 0 s * 1 '** 
endfor 
X.SHR: 

for i <- 0 to 128-size by size 

a i+size-l..i *~ c ?+size-l II c i+size-l..i+shift 
endfor 
X.SHR.U: 

for i <r~ 0 to 128-size by size 

a i+size-l..i <~ ° shlft II c i+size-l..i+shift 
endfor 

endcase 

RegWrite(rd, 128, a) 
enddef 



Fig. 32C (cont'd) 




Fig. 32D 



Format 

X.EXTRACT ra=rd,rc,rb 

ra=xextract(rd,rc,rb) 

31 24 23 18 17 12 11 65 0 

op | rd | rc | rb | ra 

8 6 6 6 6 



Fig. 33A 



# • 

Fig. 33B 



i 



# 



Definition 

def CrossbarExtract(op,ra,rb,rc,rd) as
    d <- RegRead(rd, 128)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    case b_{8..0} of
        0..255:
            gsize <- 128
        256..383:
            gsize <- 64
        384..447:
            gsize <- 32
        448..479:
            gsize <- 16
        480..495:
            gsize <- 8
        496..503:
            gsize <- 4
        504..507:
            gsize <- 2
        508..511:
            gsize <- 1
    endcase
    m <- b_{12}
    as <- signed <- b_{14}
    h <- (2-m)*gsize
    spos <- (b_{8..0}) and ((2-m)*gsize-1)
    dpos <- (0 || b_{23..16}) and (gsize-1)
    sfsize <- (0 || b_{31..24}) and (gsize-1)
    tfsize <- (sfsize = 0) or ((sfsize+dpos) > gsize) ? gsize-dpos : sfsize
    fsize <- (tfsize + spos > h) ? h - spos : tfsize
    for i <- 0 to 128-gsize by gsize
        case op of
            X.EXTRACT:
                if m then
                    p <- d_{gsize+i-1..i}
                else
                    p <- (d || c)_{2*(gsize+i)-1..2*i}
                endif
        endcase
        v <- (as & p_{h-1}) || p
        w <- (as & v_{spos+fsize-1})^{gsize-fsize-dpos} || v_{fsize-1+spos..spos} || 0^{dpos}
        if m then
            a_{size-1+i..i} <- c_{gsize-1+i..dpos+fsize+i} || w_{dpos+fsize-1..dpos} || c_{dpos-1+i..i}
        else
            a_{size-1+i..i} <- w
        endif
    endfor
    RegWrite(ra, 128, a)
enddef



fsize -x — spos 



2*gsize 
< — 




rd | rc| | rb 



ab 



fsize -x dpos - 



Crossbar extract 



Fig. 33C 



fsize -x — spos 

















'4* £• -ill 





rc 




<- fsize dpos — 
Crossbar merge extract 



Fig. 33D 



X.SHUFFLE.4 


Crossbar shuffle within pecks 


X.SHUFFLE.8 


Crossbar shuffle within bytes 


X.SHUFFLE.16 


Crossbar shuffle within doublets 


X. SHUFFLE. 3 2 


Crossbar shuffle within quadlets 


X.SHUFFLE.64 


Crossbar shuffle within octlets 


X.SHUFFLE.128 


Crossbar shuffle within hexlet 


X.SHUFFLE.256 


Crossbar shuffle within triclet 



Fig. 34A 



Format 

X.SHUFFLE.256 rd=rc,rb,v,w,h 
X.SHUFFLE.size rd=rcb,v,w 

rd=xshuffle256(rc,rb,v,w,h) 
rd=xshufflesize(rcb,v,w) 

31 24 23 18 17 12 11 6 5 0

| X.SHUFFLE | rd | rc | rb | op 

8 6 6 6 6 

rc <— rb <— rcb 
x<-log2(size) 
y<-log2(v) 
z<-log2(w) 

op <- ((x*x*x-3*x*x-4*x)/6-(z*z-z)/2+x*z+y) + (size=256)*(h*32-56) 



Fig. 34B 



Definition 

def CrossbarShuffle(major,rd,rc,rb,op)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    if rc=rb then
        case op of
            0..55:
                for x <- 2 to 7; for y <- 0 to x-2; for z <- 1 to x-y-1
                    if op = ((x*x*x-3*x*x-4*x)/6-(z*z-z)/2+x*z+y) then
                        for i <- 0 to 127
                            a_{i} <- c_{(i_{6..x} || i_{y+z-1..y} || i_{x-1..y+z} || i_{y-1..0})}
                        end
                    endif
                endfor; endfor; endfor
            56..63:
                raise ReservedInstruction
        endcase
    elseif
        case op_{4..0} of
            0..27:
                cb <- c || b
                x <- 8
                h <- op_{5}
                for y <- 0 to x-2; for z <- 1 to x-y-1
                    if op_{4..0} = ((17*z-z*z)/2-8+y) then
                        for i <- h*128 to 127+h*128
                            a_{i-h*128} <- cb_{(i_{y+z-1..y} || i_{x-1..y+z} || i_{y-1..0})}
                        end
                    endif
                endfor; endfor
            28..31:
                raise ReservedInstruction
        endcase
    endif
    RegWrite(rd, 128, a)
enddef



Fig. 34C 



# t 



127 


rcb(128) 




0 




II INtll 1 1 




1 
















„„) 


127 


rd(128) 




0 



4-way shuffle bytes within hexlet 



Fig. 34D 




4-way shuffle bytes within triclet 



Fig. 34E 



